There are a number of factors that make you happy! The World Happiness Report enumerates the happiness index by country, with a number of factors considered. We utilized machine-learning techniques to conduct further analysis and explore the level of impact of each factor. After cleaning and training our dataset, we ran a multiple-regression analysis and created a happiness formula to better explain the correlations among the happiness factors and the happiness score. We identified that seven of these factors have a more significant impact on the happiness score. We discovered that freedom and life expectancy were the factors that had the biggest impact, while generosity had a surprisingly smaller effect on the happiness score. The correlations among those factors are all positive as well, which explains the existence of ripple effects. These possible gaps among countries on the happiness index can be expected to vary depending on the inputs/factors involved.
# Update sklearn to prevent version mismatches
# !conda install scikit-learn
# !conda update scikit-learn
# !conda install joblib
# !conda update joblib
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the World Happiness Data (2015-2019) and drop any column that is entirely null.
df = pd.read_csv("whd-2015-19.csv")
df = df.dropna(axis='columns', how='all')

# Feature frame that keeps the continent label (used later for hue-coloured pair plots).
# Non-feature columns are removed and remaining NaNs are filled with 0.
# A direct fillna(0) replaces the original per-column apply(lambda x: x.fillna(0)) —
# same result, one vectorized call.
new_df_w_continent = df.drop(['country', 'happiness_rank', 'year'], axis=1).fillna(0)

# Purely numeric feature frame (continent dropped too) for regression/correlation work.
new_df = df.drop(['country', 'happiness_rank', 'continent', 'year'], axis=1).fillna(0)
new_df.head()
| happiness_score | gdp_per_capita | family | life_expectancy | freedom | generosity | government_corr | social_support | |
|---|---|---|---|---|---|---|---|---|
| 0 | 7.587 | 1.39651 | 1.34951 | 0.94143 | 0.66557 | 0.29678 | 0.41978 | 0.0 |
| 1 | 7.561 | 1.30232 | 1.40223 | 0.94784 | 0.62877 | 0.43630 | 0.14145 | 0.0 |
| 2 | 7.527 | 1.32548 | 1.36058 | 0.87464 | 0.64938 | 0.34139 | 0.48357 | 0.0 |
| 3 | 7.522 | 1.45900 | 1.33095 | 0.88521 | 0.66973 | 0.34699 | 0.36503 | 0.0 |
| 4 | 7.427 | 1.32629 | 1.32261 | 0.90563 | 0.63297 | 0.45811 | 0.32957 | 0.0 |
# Summary statistics (count/mean/std/quartiles/min/max) for the cleaned numeric features.
new_df.describe()
| happiness_score | gdp_per_capita | family | life_expectancy | freedom | generosity | government_corr | social_support | |
|---|---|---|---|---|---|---|---|---|
| count | 782.000000 | 782.000000 | 782.000000 | 782.000000 | 782.000000 | 782.000000 | 782.000000 | 782.000000 |
| mean | 5.379018 | 0.916047 | 0.595221 | 0.612416 | 0.411091 | 0.218576 | 0.125275 | 0.483171 |
| std | 1.127456 | 0.407340 | 0.544504 | 0.248309 | 0.152880 | 0.122321 | 0.105844 | 0.622930 |
| min | 2.693000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 4.509750 | 0.606500 | 0.000000 | 0.440183 | 0.309768 | 0.130000 | 0.054000 | 0.000000 |
| 50% | 5.322000 | 0.982205 | 0.698400 | 0.647310 | 0.431000 | 0.201982 | 0.090905 | 0.000000 |
| 75% | 6.189500 | 1.236187 | 1.082465 | 0.808000 | 0.531000 | 0.278832 | 0.155861 | 1.174000 |
| max | 7.769000 | 2.096000 | 1.610574 | 1.141000 | 0.724000 | 0.838075 | 0.551910 | 1.644000 |
# Separate the cleaned frame into predictors (X) and target (y = happiness_score).
y = new_df["happiness_score"]
X = new_df.drop(columns="happiness_score")
print(X.shape, y.shape)
X
(782, 7) (782,)
| gdp_per_capita | family | life_expectancy | freedom | generosity | government_corr | social_support | |
|---|---|---|---|---|---|---|---|
| 0 | 1.39651 | 1.34951 | 0.94143 | 0.66557 | 0.29678 | 0.41978 | 0.000 |
| 1 | 1.30232 | 1.40223 | 0.94784 | 0.62877 | 0.43630 | 0.14145 | 0.000 |
| 2 | 1.32548 | 1.36058 | 0.87464 | 0.64938 | 0.34139 | 0.48357 | 0.000 |
| 3 | 1.45900 | 1.33095 | 0.88521 | 0.66973 | 0.34699 | 0.36503 | 0.000 |
| 4 | 1.32629 | 1.32261 | 0.90563 | 0.63297 | 0.45811 | 0.32957 | 0.000 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 777 | 0.35900 | 0.00000 | 0.61400 | 0.55500 | 0.21700 | 0.41100 | 0.711 |
| 778 | 0.47600 | 0.00000 | 0.49900 | 0.41700 | 0.27600 | 0.14700 | 0.885 |
| 779 | 0.35000 | 0.00000 | 0.36100 | 0.00000 | 0.15800 | 0.02500 | 0.517 |
| 780 | 0.02600 | 0.00000 | 0.10500 | 0.22500 | 0.23500 | 0.03500 | 0.000 |
| 781 | 0.30600 | 0.00000 | 0.29500 | 0.01000 | 0.20200 | 0.09100 | 0.575 |
782 rows × 7 columns
# Display the target Series (happiness_score for every country-year row).
y
0 7.587
1 7.561
2 7.527
3 7.522
4 7.427
...
777 3.334
778 3.231
779 3.203
780 3.083
781 2.853
Name: happiness_score, Length: 782, dtype: float64
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out a test split (sklearn's default 25%); fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit an ordinary least-squares model on the training rows, then report the
# R^2 score on both the training and the held-out testing data.
model_n = LinearRegression()
model_n.fit(X_train, y_train)

training_score = model_n.score(X_train, y_train)
testing_score = model_n.score(X_test, y_test)
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")
Training Score: 0.7804958836281554 Testing Score: 0.7255338949205835
# Baseline predicted happiness score when every feature is zero.
print("Intercept", model_n.intercept_)
Intercept 2.1709104274842916
# Per-feature regression weights, in the same order as X's columns.
print("Coefficients", model_n.coef_)
Coefficients [0.95038985 0.83456657 1.15166485 1.59090688 0.21018595 1.03319443 0.65508653]
# Pair each feature name with its fitted weight in a small summary table.
# The previous `coef = zip(...)` binding was removed: it was never consumed
# (a second, identical zip was built inline for the DataFrame anyway).
coef_df = pd.DataFrame(
    list(zip(X.columns, model_n.coef_)),
    columns=["Features", "Coefficients"],
)
coef_df
| Features | Coefficients | |
|---|---|---|
| 0 | gdp_per_capita | 0.950390 |
| 1 | family | 0.834567 |
| 2 | life_expectancy | 1.151665 |
| 3 | freedom | 1.590907 |
| 4 | generosity | 0.210186 |
| 5 | government_corr | 1.033194 |
| 6 | social_support | 0.655087 |
# Residual plot: prediction error (predicted - actual) for both splits.
# Predictions are computed once per split instead of twice.
train_pred = model_n.predict(X_train)
test_pred = model_n.predict(X_test)

plt.scatter(train_pred, train_pred - y_train, c="blue", label="Training Data")
plt.scatter(test_pred, test_pred - y_test, c="orange", label="Testing Data")
plt.legend()
# Zero-error reference line spanning the observed score range.
plt.hlines(y=0, xmin=y.min(), xmax=y.max())
plt.title("Residual Plot")
Text(0.5, 1.0, 'Residual Plot')
import seaborn as sns

# Correlation heatmap of the k columns most correlated with happiness_score.
# NOTE(review): new_df has 8 numeric columns, so k = 7 silently drops the
# least-correlated feature from the heatmap — confirm this is intentional.
k = 7
corrmat = new_df.corr()
cols = corrmat.nlargest(k, "happiness_score")["happiness_score"].index
cm = np.corrcoef(new_df[cols].values.T)

f, ax = plt.subplots(figsize=(20, 12))
sns.set(font_scale=1.4)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 12},
                 yticklabels=cols.values, xticklabels=cols.values, vmax=1, vmin=0, cmap='YlGnBu')
# Workaround for the matplotlib 3.1.1 clipping regression that cuts the top and
# bottom heatmap rows in half.
hm.set_ylim([7, 0])
plt.show()  # unused `g0 = plt.show()` binding removed; plt.show() returns None
f.savefig('sns_heatmap.png')
# Pairwise feature plots, each saved as a PNG:
#   1) scatter with fitted regression lines
#   2) plain scatter
#   3) scatter with KDE curves on the diagonal
#   4) "+" markers with styled points and shaded diagonal KDEs
reg_grid = sns.pairplot(new_df, kind="reg")
reg_grid.savefig('sns_pairplot1.png')

scatter_grid = sns.pairplot(new_df, kind="scatter")
scatter_grid.savefig('sns_pairplot2.png')

kde_grid = sns.pairplot(new_df, diag_kind="kde")
kde_grid.savefig('sns_pairplot3.png')

styled_grid = sns.pairplot(
    new_df,
    diag_kind="kde",
    markers="+",
    plot_kws={"s": 50, "edgecolor": "b", "linewidth": 1},
    diag_kws={"shade": True},
)
styled_grid.savefig('sns_pairplot4.png')
# Custom PairGrid: scatter above the diagonal, KDE contours below, KDE on the diagonal.
grid_plain = sns.PairGrid(new_df)
grid_plain.map_upper(plt.scatter)
grid_plain.map_lower(sns.kdeplot)
grid_plain.map_diag(sns.kdeplot, lw=3, legend=False)
grid_plain.savefig('sns_pairplot5.png')

# Pair plot coloured by continent, with KDE diagonals.
continent_pairs = sns.pairplot(
    new_df_w_continent, hue="continent", palette="Set2", diag_kind="kde", height=2.5
)
continent_pairs.savefig('sns_pairplot6.png')

# Continent-coloured PairGrid with the same upper/lower/diagonal layout, plus a legend.
grid_hue = sns.PairGrid(new_df_w_continent, hue="continent", palette="Set2", height=2.5)
grid_hue.map_upper(plt.scatter)
grid_hue.map_lower(sns.kdeplot)
grid_hue.map_diag(sns.kdeplot, lw=3, legend=False)
grid_hue = grid_hue.add_legend()
grid_hue.savefig('sns_pairplot7.png')